Analisis Exploratorio¶
InĀ [2]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import scipy.stats as stats
import statsmodels.stats.diagnostic as diag
import statsmodels.api as sm
import funciones
InĀ [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
#Importar Datos
house_prices=pd.read_csv("train.csv")
InĀ [4]:
print(house_prices.info())
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1460 entries, 0 to 1459 Data columns (total 81 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Id 1460 non-null int64 1 MSSubClass 1460 non-null int64 2 MSZoning 1460 non-null object 3 LotFrontage 1201 non-null float64 4 LotArea 1460 non-null int64 5 Street 1460 non-null object 6 Alley 91 non-null object 7 LotShape 1460 non-null object 8 LandContour 1460 non-null object 9 Utilities 1460 non-null object 10 LotConfig 1460 non-null object 11 LandSlope 1460 non-null object 12 Neighborhood 1460 non-null object 13 Condition1 1460 non-null object 14 Condition2 1460 non-null object 15 BldgType 1460 non-null object 16 HouseStyle 1460 non-null object 17 OverallQual 1460 non-null int64 18 OverallCond 1460 non-null int64 19 YearBuilt 1460 non-null int64 20 YearRemodAdd 1460 non-null int64 21 RoofStyle 1460 non-null object 22 RoofMatl 1460 non-null object 23 Exterior1st 1460 non-null object 24 Exterior2nd 1460 non-null object 25 MasVnrType 588 non-null object 26 MasVnrArea 1452 non-null float64 27 ExterQual 1460 non-null object 28 ExterCond 1460 non-null object 29 Foundation 1460 non-null object 30 BsmtQual 1423 non-null object 31 BsmtCond 1423 non-null object 32 BsmtExposure 1422 non-null object 33 BsmtFinType1 1423 non-null object 34 BsmtFinSF1 1460 non-null int64 35 BsmtFinType2 1422 non-null object 36 BsmtFinSF2 1460 non-null int64 37 BsmtUnfSF 1460 non-null int64 38 TotalBsmtSF 1460 non-null int64 39 Heating 1460 non-null object 40 HeatingQC 1460 non-null object 41 CentralAir 1460 non-null object 42 Electrical 1459 non-null object 43 1stFlrSF 1460 non-null int64 44 2ndFlrSF 1460 non-null int64 45 LowQualFinSF 1460 non-null int64 46 GrLivArea 1460 non-null int64 47 BsmtFullBath 1460 non-null int64 48 BsmtHalfBath 1460 non-null int64 49 FullBath 1460 non-null int64 50 HalfBath 1460 non-null int64 51 BedroomAbvGr 1460 non-null int64 52 KitchenAbvGr 1460 non-null int64 53 KitchenQual 1460 non-null object 54 TotRmsAbvGrd 1460 non-null int64 55 Functional 1460 non-null object 56 Fireplaces 1460 non-null int64 57 FireplaceQu 770 non-null object 58 GarageType 1379 non-null object 59 GarageYrBlt 1379 non-null float64 60 GarageFinish 1379 non-null object 61 GarageCars 1460 non-null int64 62 GarageArea 1460 non-null int64 63 GarageQual 1379 non-null object 64 GarageCond 1379 non-null object 65 PavedDrive 1460 non-null object 66 WoodDeckSF 1460 non-null int64 67 OpenPorchSF 1460 non-null int64 68 EnclosedPorch 1460 non-null int64 69 3SsnPorch 1460 non-null int64 70 ScreenPorch 1460 non-null int64 71 PoolArea 1460 non-null int64 72 PoolQC 7 non-null object 73 Fence 281 non-null object 74 MiscFeature 54 non-null object 75 MiscVal 1460 non-null int64 76 MoSold 1460 non-null int64 77 YrSold 1460 non-null int64 78 SaleType 1460 non-null object 79 SaleCondition 1460 non-null object 80 SalePrice 1460 non-null int64 dtypes: float64(3), int64(35), object(43) memory usage: 924.0+ KB None
InĀ [5]:
cuantitiativas = house_prices.select_dtypes(include=['int64', 'float64']).columns.tolist()
cualitativas = house_prices.select_dtypes(include=['object']).columns.tolist()
print(cuantitiativas)
print(cualitativas)
['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SalePrice'] ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']
InĀ [6]:
# Utilizados para conocer si las variables pueden representar un valor cualitativo
print(house_prices['MSSubClass'].unique())
print(house_prices['OverallQual'].unique())
print(house_prices['LowQualFinSF'].unique())
print(house_prices['BsmtFullBath'].unique())
print(house_prices['BsmtHalfBath'].unique())
print(house_prices['KitchenAbvGr'].unique())
print(house_prices['MoSold'].unique())
cuantitiativas.remove('MSSubClass')
cuantitiativas.remove('MoSold')
cualitativas.append('MSSubClass')
cualitativas.append('MoSold')
#El Id no nos sirve en el analisis
cuantitiativas.remove('Id')
[ 60 20 70 50 190 45 90 120 30 85 80 160 75 180 40] [ 7 6 8 5 9 4 10 3 1 2] [ 0 360 513 234 528 572 144 392 371 390 420 473 156 515 80 53 232 481 120 514 397 479 205 384] [1 0 2 3] [0 1 2] [1 2 3 0] [ 2 5 9 12 10 8 11 4 1 7 3 6]
InĀ [6]:
#En base a un analisis general se decidio mover estas variables a las cuantitativas para hacer una tabla de frecuencia.
variables_a_mover = [
'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
'GarageCars', 'KitchenAbvGr', 'YrSold', 'Fireplaces'
]
for var in variables_a_mover:
if var in cuantitiativas:
cuantitiativas.remove(var)
cualitativas.append(var)
for col in cuantitiativas:
print("***" + col + "***")
funciones.prueba_de_normalidad(house_prices[col],col)
print("\n")
***LotFrontage*** EstadĆstico de prueba (ks_statistic) = 0.10447205487401189927 p-value = 0.00000000000723401475 Se rechaza la hipótesis nula: los datos de 'LotFrontage' NO provienen de una distribución normal.
***LotArea*** EstadĆstico de prueba (ks_statistic) = 0.25146904348964016496 p-value = 0.00000000000000000000 Se rechaza la hipótesis nula: los datos de 'LotArea' NO provienen de una distribución normal.
***OverallQual*** EstadĆstico de prueba (ks_statistic) = 0.15523045240535604528 p-value = 0.00000000000000000000 Se rechaza la hipótesis nula: los datos de 'OverallQual' NO provienen de una distribución normal.
***OverallCond*** EstadĆstico de prueba (ks_statistic) = 0.32009439633252723123 p-value = 0.00000000000000000000 Se rechaza la hipótesis nula: los datos de 'OverallCond' NO provienen de una distribución normal.
***YearBuilt*** EstadĆstico de prueba (ks_statistic) = 0.12102104059878149300 p-value = 0.00000000000000000043 Se rechaza la hipótesis nula: los datos de 'YearBuilt' NO provienen de una distribución normal.
***YearRemodAdd*** EstadĆstico de prueba (ks_statistic) = 0.17460226465795025685 p-value = 0.00000000000000000000 Se rechaza la hipótesis nula: los datos de 'YearRemodAdd' NO provienen de una distribución normal.
***MasVnrArea*** EstadĆstico de prueba (ks_statistic) = 0.12376406278743445899 p-value = 0.00000002387694009518 Se rechaza la hipótesis nula: los datos de 'MasVnrArea' NO provienen de una distribución normal.
***BsmtFinSF1*** EstadĆstico de prueba (ks_statistic) = 0.07824075493569671025 p-value = 0.00000982578619542059 Se rechaza la hipótesis nula: los datos de 'BsmtFinSF1' NO provienen de una distribución normal.
***BsmtFinSF2*** EstadĆstico de prueba (ks_statistic) = 0.10366232526210483744 p-value = 0.05134981246596848070 No se rechaza la hipótesis nula: los datos de 'BsmtFinSF2' parecen provenir de una distribución normal.
***BsmtUnfSF*** EstadĆstico de prueba (ks_statistic) = 0.08881017317189798810 p-value = 0.00000000116643528412 Se rechaza la hipótesis nula: los datos de 'BsmtUnfSF' NO provienen de una distribución normal.
***TotalBsmtSF*** EstadĆstico de prueba (ks_statistic) = 0.07593952292488681532 p-value = 0.00000009069791178247 Se rechaza la hipótesis nula: los datos de 'TotalBsmtSF' NO provienen de una distribución normal.
***1stFlrSF*** EstadĆstico de prueba (ks_statistic) = 0.08691902539030016417 p-value = 0.00000000047910297651 Se rechaza la hipótesis nula: los datos de '1stFlrSF' NO provienen de una distribución normal.
***2ndFlrSF*** EstadĆstico de prueba (ks_statistic) = 0.08459064426645801493 p-value = 0.00022376280275689071 Se rechaza la hipótesis nula: los datos de '2ndFlrSF' NO provienen de una distribución normal.
***LowQualFinSF*** EstadĆstico de prueba (ks_statistic) = 0.19230487009818358901 p-value = 0.25663718932099255365 No se rechaza la hipótesis nula: los datos de 'LowQualFinSF' parecen provenir de una distribución normal.
***GrLivArea*** EstadĆstico de prueba (ks_statistic) = 0.06746422821216169208 p-value = 0.00000319638564181046 Se rechaza la hipótesis nula: los datos de 'GrLivArea' NO provienen de una distribución normal.
***BedroomAbvGr*** EstadĆstico de prueba (ks_statistic) = 0.28148584016671784891 p-value = 0.00000000000000000000 Se rechaza la hipótesis nula: los datos de 'BedroomAbvGr' NO provienen de una distribución normal.
***TotRmsAbvGrd*** EstadĆstico de prueba (ks_statistic) = 0.16748240064141528549 p-value = 0.00000000000000000000 Se rechaza la hipótesis nula: los datos de 'TotRmsAbvGrd' NO provienen de una distribución normal.
***GarageYrBlt*** EstadĆstico de prueba (ks_statistic) = 0.12757418726210723037 p-value = 0.00000000000000000005 Se rechaza la hipótesis nula: los datos de 'GarageYrBlt' NO provienen de una distribución normal.
***GarageArea*** EstadĆstico de prueba (ks_statistic) = 0.07528339704878073135 p-value = 0.00000012131559844146 Se rechaza la hipótesis nula: los datos de 'GarageArea' NO provienen de una distribución normal.
***WoodDeckSF*** EstadĆstico de prueba (ks_statistic) = 0.12543891505085835725 p-value = 0.00000000047924194542 Se rechaza la hipótesis nula: los datos de 'WoodDeckSF' NO provienen de una distribución normal.
***OpenPorchSF*** EstadĆstico de prueba (ks_statistic) = 0.15742228443806632887 p-value = 0.00000000000000000720 Se rechaza la hipótesis nula: los datos de 'OpenPorchSF' NO provienen de una distribución normal.
***EnclosedPorch*** EstadĆstico de prueba (ks_statistic) = 0.07134036259736109553 p-value = 0.22917263590338987633 No se rechaza la hipótesis nula: los datos de 'EnclosedPorch' parecen provenir de una distribución normal.
***3SsnPorch*** EstadĆstico de prueba (ks_statistic) = 0.18396395854341418552 p-value = 0.34770579091415720896 No se rechaza la hipótesis nula: los datos de '3SsnPorch' parecen provenir de una distribución normal.
***ScreenPorch*** EstadĆstico de prueba (ks_statistic) = 0.13615686603114729447 p-value = 0.02451151820985830732 Se rechaza la hipótesis nula: los datos de 'ScreenPorch' NO provienen de una distribución normal.
***PoolArea*** EstadĆstico de prueba (ks_statistic) = 0.21154495663519734538 p-value = 0.85431846228735208726 No se rechaza la hipótesis nula: los datos de 'PoolArea' parecen provenir de una distribución normal.
***MiscVal*** EstadĆstico de prueba (ks_statistic) = 0.33553350759646333179 p-value = 0.00001007772934356749 Se rechaza la hipótesis nula: los datos de 'MiscVal' NO provienen de una distribución normal.
***SalePrice*** EstadĆstico de prueba (ks_statistic) = 0.12366990729158056084 p-value = 0.00000000000000000006 Se rechaza la hipótesis nula: los datos de 'SalePrice' NO provienen de una distribución normal.
InĀ [7]:
for col in cualitativas:
print("***" + col + "***")
funciones.frecuencias(house_prices[col], col)
print("\n")
***MSZoning***
***Street***
***Alley***
***LotShape***
***LandContour***
***Utilities***
***LotConfig***
***LandSlope***
***Neighborhood***
***Condition1***
***Condition2***
***BldgType***
***HouseStyle***
***RoofStyle***
***RoofMatl***
***Exterior1st***
***Exterior2nd***
***MasVnrType***
***ExterQual***
***ExterCond***
***Foundation***
***BsmtQual***
***BsmtCond***
***BsmtExposure***
***BsmtFinType1***
***BsmtFinType2***
***Heating***
***HeatingQC***
***CentralAir***
***Electrical***
***KitchenQual***
***Functional***
***FireplaceQu***
***GarageType***
***GarageFinish***
***GarageQual***
***GarageCond***
***PavedDrive***
***PoolQC***
***Fence***
***MiscFeature***
***SaleType***
***SaleCondition***
***MSSubClass***
***MoSold***
AnÔlisis de grupos¶
InĀ [8]:
import random
import pyclustertend
import sklearn
InĀ [9]:
# Procesamiento de categoricas
cat_houses = funciones.trans_categorical(house_prices)
num_houses = cat_houses.select_dtypes(include=[float, int])
X = funciones.preprocess(num_houses).dropna()
X.info()
<class 'pandas.core.frame.DataFrame'> Index: 524 entries, 0 to 1459 Data columns (total 36 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 MSSubClass 524 non-null int64 1 LotFrontage 524 non-null float64 2 LotArea 524 non-null int64 3 Alley 524 non-null int32 4 LotShape 524 non-null int32 5 OverallQual 524 non-null int64 6 OverallCond 524 non-null int64 7 YearBuilt 524 non-null int64 8 YearRemodAdd 524 non-null int64 9 MasVnrType 524 non-null int32 10 ExterQual 524 non-null int64 11 ExterCond 524 non-null int64 12 BsmtQual 524 non-null float64 13 TotalBsmtSF 524 non-null int64 14 HeatingQC 524 non-null int64 15 1stFlrSF 524 non-null int64 16 GrLivArea 524 non-null int64 17 BsmtFullBath 524 non-null int64 18 BsmtHalfBath 524 non-null int64 19 FullBath 524 non-null int64 20 HalfBath 524 non-null int64 21 BedroomAbvGr 524 non-null int64 22 KitchenAbvGr 524 non-null int64 23 KitchenQual 524 non-null int64 24 TotRmsAbvGrd 524 non-null int64 25 Fireplaces 524 non-null int64 26 FireplaceQu 524 non-null float64 27 GarageYrBlt 524 non-null float64 28 GarageFinish 524 non-null float64 29 GarageCars 524 non-null int64 30 GarageArea 524 non-null int64 31 PoolQC 524 non-null float64 32 Fence 524 non-null float64 33 MoSold 524 non-null int64 34 YrSold 524 non-null int64 35 SalePrice 524 non-null int64 dtypes: float64(7), int32(3), int64(26) memory usage: 145.3 KB
InĀ [10]:
results_df = funciones.test_random_clusters(X, num_tests=1000, min_cols=3, max_cols=5, random_state=531)
print(results_df.head(10))
Columns Hopkins_Stat 572 [MoSold, GrLivArea, LotFrontage, YrSold, Kitch... 0.366874 423 [LotArea, TotalBsmtSF, BsmtHalfBath, MoSold] 0.352115 220 [LotArea, GrLivArea, MoSold] 0.321489 514 [LotArea, GarageArea, LotFrontage, YrSold] 0.307387 470 [LotArea, YrSold, SalePrice, Alley] 0.302574 93 [PoolQC, YrSold, LotFrontage, GarageArea] 0.297427 199 [PoolQC, GrLivArea, Alley] 0.295881 814 [GrLivArea, YrSold, LotFrontage, GarageYrBlt] 0.294164 137 [MoSold, 1stFlrSF, YrSold] 0.290849 356 [ExterCond, KitchenAbvGr, LotArea] 0.285118
InĀ [11]:
import itertools
columns = ['LotArea', 'SalePrice', 'Fence', 'MoSold', 'LotFrontage', 'Alley', 'GarageArea']
comb_3_cols = list(itertools.combinations(columns, 3))
hopkins_results = {}
# Evaluar la estadĆstica de Hopkins para cada combinación de columnas
for comb in comb_3_cols:
random.seed(123) # Mantener reproducibilidad en cada iteración
# Seleccionar las columnas
X_subset = X[list(comb)].dropna() # Elimina filas con NaN
if X_subset.shape[0] == 0: # Si eliminamos todas las filas, saltamos esta combinación
continue
# Escalar los datos
X_scale = sklearn.preprocessing.scale(X_subset)
# Calcular la estadĆstica de Hopkins
hopkins_stat = pyclustertend.hopkins(X_scale, len(X_scale))
# Guardar el resultado en el diccionario
hopkins_results[comb] = hopkins_stat
# Convertir resultados a DataFrame y ordenarlos de mayor a menor Hopkins
results_df = pd.DataFrame(hopkins_results.items(), columns=['Columnas', 'Hopkins'])
results_df = results_df.sort_values(by='Hopkins', ascending=False)
# Mostrar las mejores combinaciones
print(results_df.head(10))
Columnas Hopkins 19 (SalePrice, MoSold, LotFrontage) 0.365868 11 (LotArea, MoSold, GarageArea) 0.361950 1 (LotArea, SalePrice, MoSold) 0.354471 32 (MoSold, LotFrontage, GarageArea) 0.350483 17 (SalePrice, Fence, Alley) 0.320620 16 (SalePrice, Fence, LotFrontage) 0.312821 22 (SalePrice, LotFrontage, Alley) 0.311904 21 (SalePrice, MoSold, GarageArea) 0.310730 3 (LotArea, SalePrice, Alley) 0.302366 7 (LotArea, Fence, Alley) 0.294925
InĀ [12]:
comb1 = ['SalePrice', 'MoSold', 'LotFrontage']
comb2 = ['LotArea', 'SalePrice', 'MoSold']
comb3 = ['LotArea', 'MoSold', 'GarageArea']
X_cluster1 = X[comb1].dropna()
X_cluster2 = X[comb2].dropna()
X_cluster3 = X[comb3].dropna()
X_scale1 = sklearn.preprocessing.scale(X_cluster1)
X_scale2 = sklearn.preprocessing.scale(X_cluster2)
X_scale3 = sklearn.preprocessing.scale(X_cluster3)
pyclustertend.vat(X_scale1)
pyclustertend.vat(X_scale2)
pyclustertend.vat(X_scale3)
InĀ [13]:
funciones.elbow(X_scale1)
funciones.elbow(X_scale2)
funciones.elbow(X_scale3)
InĀ [17]:
import sklearn
from sklearn.decomposition import PCA
import seaborn as sns
cluster_amount = 3
X_pca1 = PCA(n_components=2).fit_transform(X_scale1)
km = sklearn.cluster.KMeans(n_clusters=cluster_amount, random_state=42).fit(X_pca1)
centroides = km.cluster_centers_
X_cluster1['Cluster'] = km.labels_
sns.scatterplot(x=X_pca1[:, 0], y=X_pca1[:, 1], hue=X_cluster1['Cluster'], palette="tab10", legend="full")
# Plot centroids
plt.scatter(centroides[:, 0], centroides[:, 1], c='red', marker='X', s=200, label="Centroids")
plt.title(f"K-Means Clustering (Reducido con PCA) - 3 Clusters")
plt.legend()
plt.show()
InĀ [18]:
cluster_amount = 3
X_pca2 = PCA(n_components=2).fit_transform(X_scale2)
km = sklearn.cluster.KMeans(n_clusters=cluster_amount, random_state=42).fit(X_pca2)
centroides = km.cluster_centers_
X_cluster2['Cluster'] = km.labels_
sns.scatterplot(x=X_pca2[:, 0], y=X_pca2[:, 1], hue=X_cluster2['Cluster'], palette="tab10", legend="full")
# Plot centroids
plt.scatter(centroides[:, 0], centroides[:, 1], c='red', marker='X', s=200, label="Centroids")
plt.title(f"K-Means Clustering (Reducido con PCA) - 3 Clusters")
plt.legend()
plt.show()
InĀ [19]:
cluster_amount = 2
X_pca3 = PCA(n_components=2).fit_transform(X_scale3)
km = sklearn.cluster.KMeans(n_clusters=cluster_amount, random_state=42).fit(X_pca3)
centroides = km.cluster_centers_
X_cluster3['Cluster'] = km.labels_
sns.scatterplot(x=X_pca3[:, 0], y=X_pca3[:, 1], hue=X_cluster3['Cluster'], palette="tab10", legend="full")
# Plot centroids
plt.scatter(centroides[:, 0], centroides[:, 1], c='red', marker='X', s=200, label="Centroids")
plt.title(f"K-Means Clustering (Reducido con PCA) - 3 Clusters")
plt.legend()
plt.show()
InĀ [13]:
funciones.sillhouette([3,4,5,6,7],X_scale1)
For n_clusters = 3 The average silhouette_score is : 0.2541338598312184 For n_clusters = 4 The average silhouette_score is : 0.2374071343822876 For n_clusters = 5 The average silhouette_score is : 0.2335609092167851 For n_clusters = 6 The average silhouette_score is : 0.22334375088557823 For n_clusters = 7 The average silhouette_score is : 0.22756550051374433
InĀ [14]:
funciones.sillhouette([3,4,5,6,7],X_scale2)
For n_clusters = 3 The average silhouette_score is : 0.24631125522043992 For n_clusters = 4 The average silhouette_score is : 0.23492848903546412 For n_clusters = 5 The average silhouette_score is : 0.2247905373806363 For n_clusters = 6 The average silhouette_score is : 0.23835291505532524 For n_clusters = 7 The average silhouette_score is : 0.23628158290112847
InĀ [16]:
funciones.sillhouette([2,3,4,5,6,7],X_scale3)
For n_clusters = 2 The average silhouette_score is : 0.25944192957193424 For n_clusters = 3 The average silhouette_score is : 0.2444204578443601 For n_clusters = 4 The average silhouette_score is : 0.238052245291672 For n_clusters = 5 The average silhouette_score is : 0.24747884981159957 For n_clusters = 6 The average silhouette_score is : 0.23847928002245364 For n_clusters = 7 The average silhouette_score is : 0.2448580418894601
InĀ [20]:
funciones.plotFeatures1(X_cluster1,3)
InĀ [21]:
funciones.plotFeatures2(X_cluster2,3)
InĀ [22]:
funciones.plotFeatures3(X_cluster3,2)
PARTE 2: ANALISIS LINEAL¶
Model univariado 1: lot area vs Sale Price¶
InĀ [23]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.preprocessing import MinMaxScaler
import sklearn.metrics as metrics
from statsmodels.stats.outliers_influence import variance_inflation_factor
scaler = MinMaxScaler(feature_range=(0, 1))
X.info()
<class 'pandas.core.frame.DataFrame'> Index: 524 entries, 0 to 1459 Data columns (total 36 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 MSSubClass 524 non-null int64 1 LotFrontage 524 non-null float64 2 LotArea 524 non-null int64 3 Alley 524 non-null int32 4 LotShape 524 non-null int32 5 OverallQual 524 non-null int64 6 OverallCond 524 non-null int64 7 YearBuilt 524 non-null int64 8 YearRemodAdd 524 non-null int64 9 MasVnrType 524 non-null int32 10 ExterQual 524 non-null int64 11 ExterCond 524 non-null int64 12 BsmtQual 524 non-null float64 13 TotalBsmtSF 524 non-null int64 14 HeatingQC 524 non-null int64 15 1stFlrSF 524 non-null int64 16 GrLivArea 524 non-null int64 17 BsmtFullBath 524 non-null int64 18 BsmtHalfBath 524 non-null int64 19 FullBath 524 non-null int64 20 HalfBath 524 non-null int64 21 BedroomAbvGr 524 non-null int64 22 KitchenAbvGr 524 non-null int64 23 KitchenQual 524 non-null int64 24 TotRmsAbvGrd 524 non-null int64 25 Fireplaces 524 non-null int64 26 FireplaceQu 524 non-null float64 27 GarageYrBlt 524 non-null float64 28 GarageFinish 524 non-null float64 29 GarageCars 524 non-null int64 30 GarageArea 524 non-null int64 31 PoolQC 524 non-null float64 32 Fence 524 non-null float64 33 MoSold 524 non-null int64 34 YrSold 524 non-null int64 35 SalePrice 524 non-null int64 dtypes: float64(7), int32(3), int64(26) memory usage: 145.3 KB
InĀ [24]:
plt.figure()
plt.scatter(X['GrLivArea'],X['SalePrice'])
plt.title("Lot Area VS Sale Price")
plt.xlabel("Area First Floor (ft^2)")
plt.ylabel("Sale Price ($)")
Out[24]:
Text(0, 0.5, 'Sale Price ($)')
InĀ [25]:
import mrl_func
mrl_func.linear_analysis(X, 'GrLivArea')
EQ: sale_price = 116.6773*lot_area +17554.6483 Mean Squared Error: 1933564010.32 R squared: 0.50 Max Real: [189000] Max Predicho: [321032.27107763] Max Diferencia: [132032.27107763] Residuales: p = 0.0022 (No Normal)
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.608
Model: OLS Adj. R-squared: 0.607
Method: Least Squares F-statistic: 564.8
Date: Sun, 02 Mar 2025 Prob (F-statistic): 4.84e-76
Time: 22:06:22 Log-Likelihood: -4379.3
No. Observations: 366 AIC: 8763.
Df Residuals: 364 BIC: 8770.
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 1.755e+04 7621.515 2.303 0.022 2566.919 3.25e+04
x1 116.6773 4.909 23.766 0.000 107.023 126.332
==============================================================================
Omnibus: 12.534 Durbin-Watson: 1.823
Prob(Omnibus): 0.002 Jarque-Bera (JB): 13.718
Skew: 0.378 Prob(JB): 0.00105
Kurtosis: 3.573 Cond. No. 5.93e+03
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 5.93e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
InĀ [143]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
mrl_X = X[['GrLivArea','SalePrice']]
mrl_y = X['SalePrice']
X_train, X_test,y_train, y_test= train_test_split(
mrl_X, mrl_y,
test_size=0.3, train_size=0.7,
random_state=42)
ind = X_train['GrLivArea'].values.reshape(-1,1)
ind_t = X_test['GrLivArea'].values.reshape(-1,1)
dep = y_train.values.reshape(-1,1)
dep_t = y_test.values.reshape(-1,1)
lm = LinearRegression()
lm.fit(ind, dep) # Entreno en train
dep_pred = lm.predict(ind_t)
mrl_func.metricas_regresion(ind, dep_t,dep_pred,lm)
explained_variance: 0.5006 mean_squared_log_error: 0.0539 r2: 0.4964 MAE: 31956.4141 MSE: 1933564010.3241 RMSE: 43972.3096 AIC: 7828.0429 BIC: 7831.9455
InĀ [145]:
resid = dep_t-dep_pred
resid_standardized = (resid - np.mean(resid)) / np.std(resid)
sm.qqplot(resid_standardized,line='45')
Out[145]:
Modelo Multivariado¶
InĀ [27]:
print(X.columns)
num_data = X[X.columns]
num_data.dropna()
y_multi = num_data.pop('SalePrice')
X_multi = num_data
X_train, X_test,y_train, y_test = train_test_split(X_multi, y_multi,test_size=0.3,train_size=0.7, random_state=42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
Index(['MSSubClass', 'LotFrontage', 'LotArea', 'Alley', 'LotShape',
'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrType',
'ExterQual', 'ExterCond', 'BsmtQual', 'TotalBsmtSF', 'HeatingQC',
'1stFlrSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
'TotRmsAbvGrd', 'Fireplaces', 'FireplaceQu', 'GarageYrBlt',
'GarageFinish', 'GarageCars', 'GarageArea', 'PoolQC', 'Fence', 'MoSold',
'YrSold', 'SalePrice'],
dtype='object')
(366, 35)
(158, 35)
(366,)
(158,)
InĀ [28]:
model_m = LinearRegression()
model_m.fit(X_train, y_train)
y_pred = model_m.predict(X_test)
len(model_m.get_params())
sm.OLS(y_train,X_train).fit().summary()
Out[28]:
| Dep. Variable: | SalePrice | R-squared: | 0.897 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.888 |
| Method: | Least Squares | F-statistic: | 100.4 |
| Date: | Sun, 02 Mar 2025 | Prob (F-statistic): | 6.04e-147 |
| Time: | 22:16:07 | Log-Likelihood: | -4135.6 |
| No. Observations: | 366 | AIC: | 8331. |
| Df Residuals: | 336 | BIC: | 8448. |
| Df Model: | 29 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| MSSubClass | 11.7581 | 50.138 | 0.235 | 0.815 | -86.865 | 110.382 |
| LotFrontage | 207.6310 | 97.637 | 2.127 | 0.034 | 15.574 | 399.688 |
| LotArea | 0.9343 | 0.568 | 1.646 | 0.101 | -0.182 | 2.051 |
| Alley | -3.908e+04 | 2.47e+05 | -0.158 | 0.874 | -5.24e+05 | 4.46e+05 |
| LotShape | -996.2574 | 822.680 | -1.211 | 0.227 | -2614.509 | 621.994 |
| OverallQual | 1.197e+04 | 1776.076 | 6.742 | 0.000 | 8480.927 | 1.55e+04 |
| OverallCond | 6131.1826 | 2119.705 | 2.892 | 0.004 | 1961.618 | 1.03e+04 |
| YearBuilt | 248.8886 | 120.790 | 2.060 | 0.040 | 11.288 | 486.489 |
| YearRemodAdd | 49.9061 | 108.296 | 0.461 | 0.645 | -163.118 | 262.930 |
| MasVnrType | 1020.7996 | 1383.528 | 0.738 | 0.461 | -1700.668 | 3742.267 |
| ExterQual | 2902.4408 | 3671.721 | 0.790 | 0.430 | -4320.015 | 1.01e+04 |
| ExterCond | -5.862e+04 | 3.7e+05 | -0.158 | 0.874 | -7.87e+05 | 6.69e+05 |
| BsmtQual | 5751.5218 | 3022.333 | 1.903 | 0.058 | -193.556 | 1.17e+04 |
| TotalBsmtSF | 45.6281 | 11.859 | 3.848 | 0.000 | 22.301 | 68.955 |
| HeatingQC | 1154.2030 | 1759.823 | 0.656 | 0.512 | -2307.457 | 4615.863 |
| 1stFlrSF | -25.3774 | 12.239 | -2.074 | 0.039 | -49.451 | -1.304 |
| GrLivArea | 63.0002 | 7.713 | 8.168 | 0.000 | 47.829 | 78.172 |
| BsmtFullBath | 1.501e+04 | 2380.109 | 6.305 | 0.000 | 1.03e+04 | 1.97e+04 |
| BsmtHalfBath | 1.959e-10 | 1.23e-09 | 0.159 | 0.873 | -2.22e-09 | 2.61e-09 |
| FullBath | -4056.4590 | 4258.122 | -0.953 | 0.341 | -1.24e+04 | 4319.476 |
| HalfBath | -2815.0859 | 4441.648 | -0.634 | 0.527 | -1.16e+04 | 5921.854 |
| BedroomAbvGr | -3622.4997 | 2715.963 | -1.334 | 0.183 | -8964.934 | 1719.934 |
| KitchenAbvGr | -1.954e+04 | 1.23e+05 | -0.158 | 0.874 | -2.62e+05 | 2.23e+05 |
| KitchenQual | 5021.8307 | 3262.576 | 1.539 | 0.125 | -1395.818 | 1.14e+04 |
| TotRmsAbvGrd | -501.6087 | 1672.051 | -0.300 | 0.764 | -3790.616 | 2787.398 |
| Fireplaces | -375.2068 | 5199.551 | -0.072 | 0.943 | -1.06e+04 | 9852.567 |
| FireplaceQu | 1997.6000 | 1587.917 | 1.258 | 0.209 | -1125.911 | 5121.111 |
| GarageYrBlt | -91.1786 | 116.280 | -0.784 | 0.434 | -319.907 | 137.550 |
| GarageFinish | -630.4249 | 1952.653 | -0.323 | 0.747 | -4471.390 | 3210.540 |
| GarageCars | 2715.0937 | 4208.800 | 0.645 | 0.519 | -5563.824 | 1.1e+04 |
| GarageArea | 33.5538 | 15.026 | 2.233 | 0.026 | 3.998 | 63.110 |
| PoolQC | 0 | 0 | nan | nan | 0 | 0 |
| Fence | 0 | 0 | nan | nan | 0 | 0 |
| MoSold | 39.2831 | 428.224 | 0.092 | 0.927 | -803.055 | 881.622 |
| YrSold | -123.7595 | 849.253 | -0.146 | 0.884 | -1794.282 | 1546.763 |
| Omnibus: | 50.440 | Durbin-Watson: | 2.032 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 270.036 |
| Skew: | -0.395 | Prob(JB): | 2.30e-59 |
| Kurtosis: | 7.133 | Cond. No. | 1.29e+16 |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 2.6e-22. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
InĀ [29]:
mrl_func.metricas_regresion(X_train,y_test, y_pred, model_m)
explained_variance: 0.8988 mean_squared_log_error: 0.0133 r2: 0.8982 MAE: 15500.8321 MSE: 390813735.2864 RMSE: 19769.0095 AIC: 7310.8494 BIC: 7447.4416
InĀ [148]:
resid_modelo1=y_test-y_pred
plt.scatter(y_pred,resid_modelo1)
plt.axhline(0,color='blue')
plt.xlabel('Valores Predictivos')
plt.ylabel('Residuales')
Out[148]:
Text(0, 0.5, 'Residuales')
InĀ [149]:
resid_standardized = (resid_modelo1 - np.mean(resid_modelo1)) / np.std(resid_modelo1)
sm.qqplot(resid_standardized,line='45')
Out[149]:
InĀ [151]:
stat, p = stats.shapiro(resid_standardized)
print(f"Residuales: p = {p:.4f} {'(No Normal)' if p < 0.05 else '(Normal)'}")
Residuales: p = 0.1554 (Normal)
Multiolinealidad¶
InĀ [169]:
vif_data = pd.DataFrame()
vif_data["Feature"] = X_train.columns
vif_data["VIF"] = [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])]
print(vif_data)
c:\Python312\Lib\site-packages\statsmodels\regression\linear_model.py:1782: RuntimeWarning: divide by zero encountered in scalar divide return 1 - self.ssr/self.centered_tss
Feature VIF 0 MSSubClass 2.153281 1 LotFrontage 2.054254 2 LotArea 1.979811 3 Alley 0.000000 4 LotShape 1.230583 5 OverallQual 4.437739 6 OverallCond 1.709132 7 YearBuilt 10.265640 8 YearRemodAdd 4.292282 9 MasVnrType 1.382165 10 ExterQual 3.435491 11 ExterCond 0.000000 12 BsmtQual 3.430978 13 TotalBsmtSF 13.774053 14 HeatingQC 2.040281 15 1stFlrSF 13.335854 16 GrLivArea 8.628253 17 BsmtFullBath 1.229705 18 BsmtHalfBath NaN 19 FullBath 3.934465 20 HalfBath 4.086651 21 BedroomAbvGr 2.910650 22 KitchenAbvGr 0.000000 23 KitchenQual 3.737035 24 TotRmsAbvGrd 4.429645 25 Fireplaces 7.636446 26 FireplaceQu 7.654880 27 GarageYrBlt 7.392591 28 GarageFinish 2.115697 29 GarageCars 5.977019 30 GarageArea 6.070420 31 PoolQC NaN 32 Fence NaN 33 MoSold 1.157290 34 YrSold 1.109999
c:\Python312\Lib\site-packages\statsmodels\regression\linear_model.py:1782: RuntimeWarning: invalid value encountered in scalar divide return 1 - self.ssr/self.centered_tss
Sobreajuste¶
InĀ [167]:
r2_train = model_m.score(X_train, y_train)
r2_test= model_m.score(X_test, y_test)
print(f"R2 Train: {r2_train:.4f}")
print(f"R2 Test: {r2_test:.4f}")
R2 Train: 0.8965 R2 Test: 0.8982
Modelo Multiple Ajustado¶
InĀ [31]:
weird_vif = ['Alley', 'ExterCond','BsmtHalfBath', 'KitchenAbvGr', 'PoolQC', 'Fence']
X_ad = X[X.columns]
X_ad = X_ad[[col for col in X_ad.columns if col not in weird_vif]]
y_multi = X_ad.pop('SalePrice')
X_multi = X_ad
X_train, X_test,y_train, y_test = train_test_split(X_multi, y_multi,test_size=0.3,train_size=0.7, random_state=42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
model_ma = LinearRegression()
model_ma.fit(X_train, y_train)
y_pred = model_ma.predict(X_test)
len(model_ma.get_params())
sm.OLS(y_train,X_train).fit().summary()
(366, 29) (158, 29) (366,) (158,)
Out[31]:
| Dep. Variable: | SalePrice | R-squared (uncentered): | 0.991 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared (uncentered): | 0.990 |
| Method: | Least Squares | F-statistic: | 1226. |
| Date: | Sun, 02 Mar 2025 | Prob (F-statistic): | 1.83e-322 |
| Time: | 22:17:39 | Log-Likelihood: | -4135.6 |
| No. Observations: | 366 | AIC: | 8329. |
| Df Residuals: | 337 | BIC: | 8442. |
| Df Model: | 29 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| MSSubClass | 11.5070 | 50.040 | 0.230 | 0.818 | -86.923 | 109.937 |
| LotFrontage | 206.8772 | 97.380 | 2.124 | 0.034 | 15.328 | 398.426 |
| LotArea | 0.9264 | 0.565 | 1.641 | 0.102 | -0.184 | 2.037 |
| LotShape | -997.8394 | 821.428 | -1.215 | 0.225 | -2613.612 | 617.933 |
| OverallQual | 1.197e+04 | 1773.303 | 6.750 | 0.000 | 8482.171 | 1.55e+04 |
| OverallCond | 6122.0331 | 2115.851 | 2.893 | 0.004 | 1960.094 | 1.03e+04 |
| YearBuilt | 247.3667 | 120.233 | 2.057 | 0.040 | 10.864 | 483.869 |
| YearRemodAdd | 48.8840 | 107.948 | 0.453 | 0.651 | -163.452 | 261.220 |
| MasVnrType | 1019.2712 | 1381.492 | 0.738 | 0.461 | -1698.162 | 3736.704 |
| ExterQual | 2870.6251 | 3660.917 | 0.784 | 0.434 | -4330.502 | 1.01e+04 |
| BsmtQual | 5790.6638 | 3007.855 | 1.925 | 0.055 | -125.872 | 1.17e+04 |
| TotalBsmtSF | 45.6414 | 11.841 | 3.854 | 0.000 | 22.349 | 68.934 |
| HeatingQC | 1159.5775 | 1756.950 | 0.660 | 0.510 | -2296.392 | 4615.547 |
| 1stFlrSF | -25.3963 | 12.220 | -2.078 | 0.038 | -49.434 | -1.359 |
| GrLivArea | 63.0406 | 7.697 | 8.190 | 0.000 | 47.900 | 78.182 |
| BsmtFullBath | 1.504e+04 | 2369.093 | 6.348 | 0.000 | 1.04e+04 | 1.97e+04 |
| FullBath | -3972.3013 | 4218.735 | -0.942 | 0.347 | -1.23e+04 | 4326.070 |
| HalfBath | -2808.9332 | 4435.049 | -0.633 | 0.527 | -1.15e+04 | 5914.933 |
| BedroomAbvGr | -3599.5588 | 2708.174 | -1.329 | 0.185 | -8926.614 | 1727.496 |
| KitchenQual | 5062.3375 | 3247.832 | 1.559 | 0.120 | -1326.241 | 1.15e+04 |
| TotRmsAbvGrd | -524.8147 | 1663.211 | -0.316 | 0.753 | -3796.397 | 2746.767 |
| Fireplaces | -346.7558 | 5188.926 | -0.067 | 0.947 | -1.06e+04 | 9860.009 |
| FireplaceQu | 1981.9456 | 1582.545 | 1.252 | 0.211 | -1130.965 | 5094.856 |
| GarageYrBlt | -91.5156 | 116.092 | -0.788 | 0.431 | -319.872 | 136.841 |
| GarageFinish | -617.0649 | 1948.007 | -0.317 | 0.752 | -4448.850 | 3214.720 |
| GarageCars | 2705.7255 | 4202.293 | 0.644 | 0.520 | -5560.304 | 1.1e+04 |
| GarageArea | 33.6399 | 14.994 | 2.244 | 0.026 | 4.146 | 63.134 |
| MoSold | 25.6012 | 418.817 | 0.061 | 0.951 | -798.224 | 849.426 |
| YrSold | -257.1634 | 109.594 | -2.347 | 0.020 | -472.738 | -41.588 |
| Omnibus: | 50.691 | Durbin-Watson: | 2.032 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 271.947 |
| Skew: | -0.398 | Prob(JB): | 8.86e-60 |
| Kurtosis: | 7.147 | Cond. No. | 5.64e+04 |
Notes:
[1] R² is computed without centering (uncentered) since the model does not contain a constant.
[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[3] The condition number is large, 5.64e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
InĀ [91]:
vif_data = pd.DataFrame()
vif_data["Feature"] = X_train.columns
vif_data["VIF"] = [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])]
print(vif_data)
Feature VIF 0 MSSubClass 7.354260 1 LotFrontage 42.058209 2 LotArea 27.485757 3 LotShape 3.333275 4 OverallQual 121.419315 5 OverallCond 114.365489 6 YearBuilt 50142.560320 7 YearRemodAdd 40696.586473 8 MasVnrType 9.667346 9 ExterQual 81.064643 10 BsmtQual 64.293433 11 TotalBsmtSF 171.025987 12 HeatingQC 55.708364 13 1stFlrSF 193.987251 14 GrLivArea 125.939981 15 BsmtFullBath 2.042299 16 FullBath 47.048871 17 HalfBath 6.588546 18 BedroomAbvGr 53.339675 19 KitchenQual 68.657787 20 TotRmsAbvGrd 106.957732 21 Fireplaces 16.285712 22 FireplaceQu 16.801936 23 GarageYrBlt 46895.999625 24 GarageFinish 15.682784 25 GarageCars 64.258060 26 GarageArea 57.366204 27 MoSold 7.542560 28 YrSold 42703.890281
InĀ [92]:
mrl_func.metricas_regresion(X_train, y_test, y_pred, model_ma)
explained_variance: 0.8988 mean_squared_log_error: 0.0133 r2: 0.8982 MAE: 15500.8321 MSE: 390813735.2864 RMSE: 19769.0095 AIC: 7298.8494 BIC: 7412.0258
InĀ [130]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV, Ridge
from sklearn.model_selection import cross_validate
X_train_scaled = scaler.fit_transform(X_train)
from sklearn.model_selection import ShuffleSplit
alphas = np.logspace(-15, 2, num=50)
ridge = make_pipeline(StandardScaler(),
RidgeCV(alphas=alphas, store_cv_values=True))
cv = ShuffleSplit(n_splits=5, random_state=1)
cv_results = cross_validate(ridge, X_train_scaled, y_train,
cv=cv, scoring="neg_mean_squared_error",
return_train_score=True,
return_estimator=True, n_jobs=2)
train_error = -cv_results["train_score"]
print(f"Error medio cuadrado de la regresión con datos de entrenamiento:\n"
f"{train_error.mean():.3f} ± {train_error.std():.3f}")
test_error = -cv_results["test_score"]
print(f"Error medio cuadrado de la regresion con los datos de prueba:\n"
f"{test_error.mean():.3f} ± {test_error.std():.3f}")
mse_alphas = [est[-1].cv_values_.mean(axis=0)
for est in cv_results["estimator"]]
cv_alphas = pd.DataFrame(mse_alphas, columns=alphas)
cv_alphas
Error medio cuadrado de la regresión con datos de entrenamiento: 382360711.973 ± 20571304.353 Error medio cuadrado de la regresion con los datos de prueba: 463989206.734 ± 176366554.816
c:\Python312\Lib\site-packages\sklearn\utils\deprecation.py:110: FutureWarning: Attribute `cv_values_` is deprecated in version 1.5 and will be removed in 1.7. Use `cv_results_` instead. warnings.warn(msg, category=FutureWarning)
Out[130]:
| 1.000000e-15 | 2.222996e-15 | 4.941713e-15 | 1.098541e-14 | 2.442053e-14 | 5.428675e-14 | 1.206793e-13 | 2.682696e-13 | 5.963623e-13 | 1.325711e-12 | 2.947052e-12 | 6.551286e-12 | 1.456348e-11 | 3.237458e-11 | 7.196857e-11 | 1.599859e-10 | 3.556480e-10 | 7.906043e-10 | 1.757511e-09 | 3.906940e-09 | 8.685114e-09 | 1.930698e-08 | 4.291934e-08 | 9.540955e-08 | 2.120951e-07 | 4.714866e-07 | 1.048113e-06 | 2.329952e-06 | 5.179475e-06 | 1.151395e-05 | 2.559548e-05 | 5.689866e-05 | 1.264855e-04 | 2.811769e-04 | 6.250552e-04 | 1.389495e-03 | 3.088844e-03 | 6.866488e-03 | 1.526418e-02 | 3.393222e-02 | 7.543120e-02 | 1.676833e-01 | 3.727594e-01 | 8.286428e-01 | 1.842070e+00 | 4.094915e+00 | 9.102982e+00 | 2.023590e+01 | 4.498433e+01 | 1.000000e+02 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 4.761602e+08 | 4.761602e+08 | 4.761602e+08 | 4.761602e+08 | 4.761602e+08 | 4.761602e+08 | 4.761602e+08 | 4.761602e+08 | 4.761602e+08 | 4.761602e+08 | 4.761602e+08 | 4.761602e+08 | 4.761602e+08 | 4.761602e+08 | 4.761602e+08 | 4.761602e+08 | 4.761602e+08 | 4.761602e+08 | 4.761602e+08 | 4.761602e+08 | 4.761602e+08 | 4.761602e+08 | 4.761602e+08 | 4.761602e+08 | 4.761602e+08 | 4.761602e+08 | 4.761602e+08 | 4.761602e+08 | 4.761602e+08 | 4.761602e+08 | 4.761602e+08 | 4.761601e+08 | 4.761600e+08 | 4.761597e+08 | 4.761592e+08 | 4.761580e+08 | 4.761552e+08 | 4.761491e+08 | 4.761356e+08 | 4.761058e+08 | 4.760398e+08 | 4.758960e+08 | 4.755888e+08 | 4.749636e+08 | 4.738166e+08 | 4.721267e+08 | 4.706264e+08 | 4.710942e+08 | 4.762541e+08 | 4.921375e+08 |
| 1 | 4.841514e+08 | 4.841514e+08 | 4.841514e+08 | 4.841514e+08 | 4.841514e+08 | 4.841514e+08 | 4.841514e+08 | 4.841514e+08 | 4.841514e+08 | 4.841514e+08 | 4.841514e+08 | 4.841514e+08 | 4.841514e+08 | 4.841514e+08 | 4.841514e+08 | 4.841514e+08 | 4.841514e+08 | 4.841514e+08 | 4.841514e+08 | 4.841514e+08 | 4.841514e+08 | 4.841514e+08 | 4.841514e+08 | 4.841514e+08 | 4.841514e+08 | 4.841514e+08 | 4.841514e+08 | 4.841514e+08 | 4.841514e+08 | 4.841514e+08 | 4.841514e+08 | 4.841513e+08 | 4.841512e+08 | 4.841510e+08 | 4.841504e+08 | 4.841492e+08 | 4.841465e+08 | 4.841406e+08 | 4.841274e+08 | 4.840982e+08 | 4.840338e+08 | 4.838934e+08 | 4.835937e+08 | 4.829848e+08 | 4.818688e+08 | 4.802142e+08 | 4.786617e+08 | 4.787992e+08 | 4.832580e+08 | 4.979798e+08 |
| 2 | 4.834183e+08 | 4.834183e+08 | 4.834183e+08 | 4.834183e+08 | 4.834183e+08 | 4.834183e+08 | 4.834183e+08 | 4.834183e+08 | 4.834183e+08 | 4.834183e+08 | 4.834183e+08 | 4.834183e+08 | 4.834183e+08 | 4.834183e+08 | 4.834183e+08 | 4.834183e+08 | 4.834183e+08 | 4.834183e+08 | 4.834183e+08 | 4.834183e+08 | 4.834183e+08 | 4.834183e+08 | 4.834183e+08 | 4.834183e+08 | 4.834183e+08 | 4.834183e+08 | 4.834183e+08 | 4.834183e+08 | 4.834183e+08 | 4.834183e+08 | 4.834182e+08 | 4.834182e+08 | 4.834181e+08 | 4.834178e+08 | 4.834173e+08 | 4.834160e+08 | 4.834133e+08 | 4.834072e+08 | 4.833938e+08 | 4.833640e+08 | 4.832983e+08 | 4.831551e+08 | 4.828498e+08 | 4.822311e+08 | 4.811086e+08 | 4.795150e+08 | 4.783657e+08 | 4.798859e+08 | 4.873909e+08 | 5.070359e+08 |
| 3 | 4.738953e+08 | 4.738953e+08 | 4.738953e+08 | 4.738953e+08 | 4.738953e+08 | 4.738953e+08 | 4.738953e+08 | 4.738953e+08 | 4.738953e+08 | 4.738953e+08 | 4.738953e+08 | 4.738953e+08 | 4.738953e+08 | 4.738953e+08 | 4.738953e+08 | 4.738953e+08 | 4.738953e+08 | 4.738953e+08 | 4.738953e+08 | 4.738953e+08 | 4.738953e+08 | 4.738953e+08 | 4.738953e+08 | 4.738953e+08 | 4.738953e+08 | 4.738953e+08 | 4.738953e+08 | 4.738953e+08 | 4.738953e+08 | 4.738953e+08 | 4.738953e+08 | 4.738952e+08 | 4.738951e+08 | 4.738948e+08 | 4.738943e+08 | 4.738930e+08 | 4.738903e+08 | 4.738842e+08 | 4.738706e+08 | 4.738404e+08 | 4.737741e+08 | 4.736293e+08 | 4.733207e+08 | 4.726951e+08 | 4.715570e+08 | 4.699154e+08 | 4.685730e+08 | 4.694600e+08 | 4.756318e+08 | 4.934590e+08 |
| 4 | 4.188281e+08 | 4.188281e+08 | 4.188281e+08 | 4.188281e+08 | 4.188281e+08 | 4.188281e+08 | 4.188281e+08 | 4.188281e+08 | 4.188281e+08 | 4.188281e+08 | 4.188281e+08 | 4.188281e+08 | 4.188281e+08 | 4.188281e+08 | 4.188281e+08 | 4.188281e+08 | 4.188281e+08 | 4.188281e+08 | 4.188281e+08 | 4.188281e+08 | 4.188281e+08 | 4.188281e+08 | 4.188281e+08 | 4.188281e+08 | 4.188281e+08 | 4.188281e+08 | 4.188281e+08 | 4.188281e+08 | 4.188281e+08 | 4.188281e+08 | 4.188281e+08 | 4.188281e+08 | 4.188280e+08 | 4.188277e+08 | 4.188272e+08 | 4.188261e+08 | 4.188235e+08 | 4.188179e+08 | 4.188055e+08 | 4.187779e+08 | 4.187171e+08 | 4.185845e+08 | 4.183024e+08 | 4.177330e+08 | 4.167104e+08 | 4.153050e+08 | 4.144753e+08 | 4.164485e+08 | 4.244624e+08 | 4.446428e+08 |
InĀ [131]:
cv_alphas.mean(axis=0).plot(marker="+")
plt.ylabel("Error medio cuadrado\n (menos es mejor)")
plt.xlabel("alpha")
_ = plt.title("Error obtenido en la validación cruzada")
InĀ [132]:
mejores_alphas = [est[-1].alpha_ for est in cv_results["estimator"]]
mejores_alphas
Out[132]:
[9.102981779915227, 9.102981779915227, 9.102981779915227, 9.102981779915227, 9.102981779915227]
InĀ [129]:
print(f"El mejor alfa es:\n"
f"{np.mean(mejores_alphas):.2f} ± {np.std(mejores_alphas):.2f}")
El mejor alfa es: 11.24 ± 4.07
InĀ [137]:
alfa_modelo3 = np.mean(mejores_alphas)
ridge = Ridge(alpha=alfa_modelo3)
ridge.fit(X_train_scaled, y_train)
# Predict the target values on the testing set
X_test_scaled = scaler.transform(X_test)
y_pred_ridge = ridge.predict(X_test_scaled)
InĀ [140]:
mrl_func.metricas_regresion(X_test_scaled, y_test, y_pred_ridge, ridge)
explained_variance: 0.8735 mean_squared_log_error: 0.014 r2: 0.8729 MAE: 17148.1366 MSE: 487937471.5784 RMSE: 22089.3067 AIC: 3218.9003 BIC: 3307.7155
InĀ [141]:
resid_ridge=y_test-y_pred_ridge
plt.scatter(y_pred_ridge,resid_ridge)
plt.axhline(0,color='blue')
plt.xlabel('Valores Predictivos')
plt.ylabel('Residuales')
Out[141]:
Text(0, 0.5, 'Residuales')
InĀ [103]:
resid_standardized = (resid_ridge - np.mean(resid_ridge)) / np.std(resid_ridge)
sm.qqplot(resid_standardized,line='45')
Out[103]: